---
title: "Modélisation des données d'Airbnb dans la ville de Québec"
output:
  flexdashboard::flex_dashboard:
    theme: cosmo
    storyboard: true
    orientation: rows #Columns
    source_code: embed
    vertical_layout: fill
---
```{r setup, include=FALSE}
# Setup: attach packages, download the Inside Airbnb extracts for Quebec City
# (snapshot of 2021-04-11) and build the data frames used by the dashboard:
# calendar / init_calendar, reviews / init_reviews, listing_init, listing,
# listing1 and listing2.
#
# To display pages through a navigation menu, add {data-navmenu="Menu A"}
# after {data-icon="fa-globe"} on the page header, which gives
# {data-icon="fa-globe"} {data-navmenu="Menu A"}.
# https://rmarkdown.rstudio.com/flexdashboard/using.html
#load("C:/Users/pierr/OneDrive/Documents/ubuntu/Concours_Gov/ds.RData")
#load("C:/Users/pierr/OneDrive/Documents/Article_projet/ds.RData")
library(flexdashboard)
#attach(ds)
library(ggpubr)
library(psych)
library(pander)
library(lubridate)
library(outliers)
library(knitr)
library(dplyr)
library(gmodels)
library(readxl)
library(tidyverse)
library(plotly)
library(sqldf)
library(tidyr)
library(Amelia)
library(GGally)
library(tibble)
library(magrittr)
library(forcats)
library(purrr)
library(readr)
library(DT)
library(MASS)
library(data.table)
library(ggplot2)
library(corrplot)
library(RCurl)
#library(googlesheets4)
library(here)
library(ggeasy) # for easy ggplot editing
library(harrypotter) # for palettes
#options(scipen = 999)

# Helpers ----

# Base URL of the Inside Airbnb snapshot used throughout the dashboard.
airbnb_base_url <-
  "http://data.insideairbnb.com/canada/qc/quebec-city/2021-04-11/data/"

# Download one gzip-compressed CSV of the snapshot and return it as a data
# frame. Both connections are closed on exit so none is left open.
read_airbnb_csv <- function(file_name) {
  gz_con <- gzcon(url(paste0(airbnb_base_url, file_name)))
  on.exit(close(gz_con), add = TRUE)
  csv_lines <- readLines(gz_con)
  txt_con <- textConnection(csv_lines)
  on.exit(close(txt_con), add = TRUE)
  read.csv(txt_con)
}

# Convert price strings such as "$1,250.00" to numeric. Only "$" and the
# thousands separator are stripped: dropping the first character and the last
# three left the comma in place and turned every price >= 1,000 into NA.
parse_price <- function(x) {
  as.numeric(gsub("[$,]", "", x))
}

# Parse the `date` column and append year, month label and weekday label
# (the weekday label is stored in a column named `week`).
add_date_parts <- function(df) {
  df$date <- as.Date(df$date)
  df$year <- lubridate::year(df$date)
  df$month <- lubridate::month(df$date, label = TRUE, abbr = FALSE)
  df$week <- lubridate::wday(df$date, label = TRUE, abbr = FALSE)
  df
}

# 1/0 indicator for the "t" flag used in the source files.
is_true_flag <- function(x) ifelse(x == "t", 1, 0)

# 1/0 indicator for a non-empty string (e.g. a URL that is present).
has_value <- function(x) ifelse(x != "", 1, 0)

# Detailed calendar data for the listings in Quebec City ----
calendar <- read_airbnb_csv("calendar.csv.gz")
calendar$price <- parse_price(calendar$price)
calendar$adjusted_price <- parse_price(calendar$adjusted_price)
calendar$available <- factor(is_true_flag(calendar$available))
#calendar$month=months(calendar$date)
calendar <- add_date_parts(calendar)
init_calendar <- calendar
# NOTE(review): attach() is kept only so that code relying on bare column
# names keeps working; prefer calendar$col / data = calendar.
attach(calendar)

# Detailed review data for the listings in Quebec City ----
reviews <- read_airbnb_csv("reviews.csv.gz")
#reviews$month_reviews=months(reviews$date)
reviews <- add_date_parts(reviews)
init_reviews <- reviews
attach(reviews)

# Summary information and metrics for the listings in Quebec City ----
# Disabled alternative: fetch the uncompressed
# http://data.insideairbnb.com/canada/qc/quebec-city/2021-04-11/visualisations/listings.csv
# with getURL(urlfile, ssl.verifypeer=FALSE) and parse it with
# read.csv(textConnection(downloaded), header=TRUE).
listing <- read_airbnb_csv("listings.csv.gz")
listing_init <- listing

# listing1: the reduced set of columns used by most plots.
listing1 <- subset(
  listing,
  select = c(
    neighbourhood_cleansed, property_type, room_type, accommodates, bedrooms,
    beds, price, availability_30, availability_60, availability_365,
    number_of_reviews, review_scores_rating, reviews_per_month
  )
)
listing1$price <- parse_price(listing1$price)

# listing: drop free-text, host-identity, geographic and review-score columns
# in a single step (they were previously converted first and dropped later).
listing <- subset(
  listing,
  select = -c(
    description, neighborhood_overview, host_id, host_url, host_name,
    host_since, host_location, bathrooms, neighbourhood_group_cleansed,
    host_about, host_neighbourhood, neighbourhood, latitude, longitude,
    calendar_updated, has_availability, calendar_last_scraped, first_review,
    last_review, license, host_verifications, amenities, scrape_id,
    review_scores_rating, review_scores_accuracy, review_scores_cleanliness,
    review_scores_checkin, review_scores_communication,
    review_scores_location, review_scores_value, host_response_rate,
    host_acceptance_rate
  )
)
listing$last_scraped <- as.Date(listing$last_scraped)
listing$price <- parse_price(listing$price)
listing$host_response_time <- as.factor(listing$host_response_time)
# host_is_superhost stays a numeric 1/0 indicator (it is not turned into a
# factor, unlike the other flags below).
listing$host_is_superhost <- is_true_flag(listing$host_is_superhost)

# URL columns become "is a URL present" factors.
url_cols <- c(
  "picture_url", "host_picture_url", "host_thumbnail_url", "listing_url"
)
listing[url_cols] <- lapply(
  listing[url_cols],
  function(x) as.factor(has_value(x))
)

# "t"/"f" flags become 1/0 factors.
flag_cols <- c(
  "instant_bookable", "host_has_profile_pic", "host_identity_verified"
)
listing[flag_cols] <- lapply(
  listing[flag_cols],
  function(x) as.factor(is_true_flag(x))
)

# Count-like columns are stored as doubles.
numeric_cols <- c(
  "bedrooms", "beds", "minimum_nights", "maximum_nights",
  "minimum_minimum_nights", "maximum_minimum_nights",
  "minimum_maximum_nights", "maximum_maximum_nights",
  "availability_30", "availability_60", "availability_90",
  "availability_365", "number_of_reviews", "number_of_reviews_ltm",
  "number_of_reviews_l30d", "calculated_host_listings_count",
  "calculated_host_listings_count_entire_homes",
  "calculated_host_listings_count_private_rooms",
  "calculated_host_listings_count_shared_rooms"
)
listing[numeric_cols] <- lapply(listing[numeric_cols], as.numeric)

attach(listing)
attach(listing1)

# listing2: listing1 without the property types that occur 5 times or fewer.
# The comparison is done on the type names so it works whether read.csv()
# returned strings (R >= 4.0) or factors; the column is then stored as a
# factor holding only the remaining levels.
property_type_counts <- table(listing1$property_type)
rare_property_types <-
  names(property_type_counts)[property_type_counts <= 5]
listing2 <- listing1[!listing1$property_type %in% rare_property_types, ]
listing2$property_type <- droplevels(factor(listing2$property_type))
attach(listing2)

panderOptions("table.split.table", 300)
#panderOptions("table.style", "grid")
panderOptions("table.style", "rmarkdown")
# NOTE(review): this only creates a variable named `results`; it looks like
# it was meant to be the chunk option results = 'asis' — confirm.
results <- "asis"
# Make googlesheets4 not try to authenticate, since we're using a public sheet
#sheets_deauth()
```
Résumé sommaire des données {data-icon="fa-globe"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### Presentation de notre ensemble de données
```{r}
# Value box stating the compilation date of the data set.
# The text is passed to valueBox() directly: cat() only prints to the output
# stream and returns NULL, which left the box without a value.
newhires <- paste("Plus precisement à la date de compilation du 11 Avril 2021")
valueBox(
  value = newhires,
  icon = "fa-user-plus",
  caption = "",
  color = "black"
)
```
Row {data-width=150}
-----------------------------------------------------------------------
### listings total en date du 11 avril 2021 au Québec
```{r}
# Value box: number of rows in the raw listings download.
newhires <- paste("nous avons initialement", nrow(listing_init))
valueBox(
  newhires,
  caption = "listings",
  icon = "fa-user-plus",
  color = "green"
)
```
### Revues totales en date du 11 avril 2021 à Québec
```{r}
# Value box: number of rows in the reviews table.
newhires <- paste("avec", nrow(reviews))
valueBox(
  newhires,
  caption = "revues",
  icon = "fa-user-plus",
  color = "green"
)
```
### Entrées de calendrier totales en date du 11 avril 2021 à Québec
```{r}
# Value box: number of entries in the calendar table (length of its
# listing_id column).
calendar_entries <- length(calendar$listing_id)
newhires <- paste("et", calendar_entries)
valueBox(
  newhires,
  caption = "sollicitaions",
  icon = "fa-user-plus",
  color = "green"
)
```
Row {data-width=150}
-----------------------------------------------------------------------
### Nombre de variables initiales
```{r}
# Value box: number of columns in the raw listings download.
newhires <- paste("nous avons initialement", ncol(listing_init))
valueBox(
  newhires,
  caption = "variables pour les listing",
  icon = "fa-user-plus",
  color = "coral"
)
```
### Nombre de variables initiales des revues
```{r}
# Value box: number of columns in the prepared reviews table.
newhires <- paste("avec", ncol(init_reviews))
valueBox(
  newhires,
  caption = "variables pour revues",
  icon = "fa-user-plus",
  color = "coral"
)
```
### Nombre de variables initiales du calendrier
```{r}
# Value box: number of columns in the prepared calendar table.
newhires <- paste("et", ncol(init_calendar))
valueBox(
  newhires,
  caption = "variables pour la table calendar",
  icon = "fa-user-plus",
  color = "coral"
)
```
Row {data-width=150}
-----------------------------------------------------------------------
### Allure des disponibilités par mois
```{r,echo=FALSE,fig.height = 4, dev = 'jpeg'}
# Monthly availability: number of calendar rows per month, drawn as one line
# per level of `available`.
#
# Month labels are rebuilt from the date so they always match `month.name`.
# Re-levelling locale-dependent month labels with `month.name` turns every
# value into NA when the session locale is not English.
calendar$month <- factor(
  month.name[lubridate::month(calendar$date)],
  levels = month.name
)

# count() returns an ungrouped table, so no grouping leaks into the plot.
h1 <- calendar %>%
  dplyr::count(month, available, name = "count")

# The traces inherit `h1` from plot_ly() and map its columns with formulas;
# the `data` argument of add_lines() expects a data frame, not a vector.
p1 <- plot_ly(data = h1, x = ~month, y = ~count) %>%
  add_lines(
    linetype = ~available,
    hoverinfo = "text",
    text = ~paste(count)
  ) %>%
  layout(
    xaxis = list(title = "Month"),
    yaxis = list(title = "Count")
  )

# p1 is already a plotly object, so it needs no ggplotly() conversion.
p1 %>%
  rangeslider() %>%
  layout(hovermode = "x")
```
### les revues mensuels en fonction du prix et disponibilité annuelle par type de chambre
```{r,echo=FALSE}
# Scatter plot of monthly reviews against price: point size follows the
# yearly availability and colour follows the room type.
p <- ggplot(
  data = listing2,
  mapping = aes(
    price,
    reviews_per_month,
    size = availability_365,
    color = room_type
  )
) +
  geom_point() +
  theme_bw()

# Convert to plotly, then add the range slider and x-aligned hover.
scatter_widget <- ggplotly(p, dynamicTicks = TRUE)
scatter_widget <- rangeslider(scatter_widget)
layout(scatter_widget, hovermode = "x")
```
Visualisation Barplots {data-navmenu="presentation boxplots"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### Barplots
```{r,echo=FALSE}
# Banner value box introducing the bar-plot page.
interval2 <- "Barplots"
valueBox(
  interval2,
  caption = "Boite à moustache",
  icon = "fa-user-times",
  color = "green"
)
# Unused page header kept for reference:
# Resume visualisation {data-icon="fa-globe"} {data-navmenu="Menu B"}
```
Row {data-width=150}
-----------------------------------------------------------------------
### Barplots
```{r,echo=FALSE}
# Number of reviews per month, split by weekday (column `week`), as bars.
# https://plotly.com/r/bar-charts/
fig <- reviews %>% count(month, week)
# The bar trace is requested explicitly instead of leaving plotly to infer a
# trace type from the data.
fig <- fig %>%
  plot_ly(x = ~month, y = ~n, color = ~week, type = "bar")
# `fig` is already a plotly object: ggplotly() hands such objects back
# unchanged (its dynamicTicks argument had no effect), so it is not called.
p <- fig %>%
  rangeslider() %>%
  layout(hovermode = "x")
p
```
### mois et annees
```{r,echo=FALSE}
# Number of reviews per year, split by month, as bars.
# https://plotly.com/r/bar-charts/
ff <- reviews
# Year is treated as a category so each year gets its own bar group.
ff$year <- as.factor(reviews$year)
fig <- ff %>% count(year, month)
# The bar trace is requested explicitly instead of leaving plotly to infer a
# trace type from the data.
fig <- fig %>%
  plot_ly(x = ~year, y = ~n, color = ~month, type = "bar")
# `fig` is already a plotly object: ggplotly() hands such objects back
# unchanged (its dynamicTicks argument had no effect), so it is not called.
p <- fig %>%
  rangeslider() %>%
  layout(hovermode = "x")
p
```
Visualisation densité {data-navmenu="presentation densité"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### Cas des variables continues
```{r,echo=FALSE}
# Banner value box introducing the continuous-variable page.
interval2 <- "distribution pour variable continue"
valueBox(
  interval2,
  caption = "Boite à moustache",
  icon = "fa-user-times",
  color = "green"
)
# Unused page header kept for reference:
# Resume visualisation {data-icon="fa-globe"} {data-navmenu="Menu B"}
```
Row {data-width=150}
-----------------------------------------------------------------------
### Cas continue
```{r,echo=FALSE}
# Density of reviews_per_month drawn as one line per room type.
density_layer <- stat_density(
  aes(group = room_type, color = room_type),
  position = "identity",
  geom = "line"
)
p <- ggplot(listing2, aes(x = reviews_per_month)) + density_layer

# Interactive version with a range slider and x-aligned hover.
fig <- ggplotly(p, dynamicTicks = TRUE)
fig <- rangeslider(fig)
fig <- layout(fig, hovermode = "x")
fig
```
### continue2
```{r,echo=FALSE}
# Kernel density of price, filled by room type with semi-transparent areas.
p <- ggplot(data = listing2, mapping = aes(x = price)) +
  geom_density(mapping = aes(fill = room_type), alpha = 0.5) +
  ggtitle("Kernel Density estimates by group")

# Interactive version with a range slider and x-aligned hover.
fig <- ggplotly(p, dynamicTicks = TRUE)
fig <- rangeslider(fig)
fig <- layout(fig, hovermode = "x")
fig
```
Distribution des données {data-navmenu="presentation distribution"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### Cas des variables catégorielles
```{r,echo=FALSE}
# Banner value box introducing the per-class distribution page.
interval2 <- "distribution par classe"
valueBox(
  interval2,
  caption = "Boite à moustache",
  icon = "fa-user-times",
  color = "green"
)
# Unused page header kept for reference:
# Resume visualisation {data-icon="fa-globe"} {data-navmenu="Menu B"}
```
Row {data-width=150}
-----------------------------------------------------------------------
### boxplots du prix par type de chambre
```{r,echo=FALSE}
# Box plot of listing price, one box per room type.
# Disabled earlier variant, kept for reference:
#   g <- qplot(room_type, log(price), data = listing2, geom = c("boxplot"),
#              fill = neighbourhood_cleansed)
#   ggplotly(g)
fig <- plot_ly(
  data = listing2,
  type = "box",
  y = ~price,
  color = ~room_type
)
fig
```
### boxplot du prix fonction du quartier
```{r,echo=FALSE}
# Box plot of listing price, one box per neighbourhood.
# Disabled earlier variant, kept for reference:
#   g <- qplot(room_type, log(price), data = listing2, geom = c("boxplot"),
#              fill = property_type)
#   ggplotly(g)
fig <- plot_ly(
  data = listing2,
  type = "box",
  y = ~price,
  color = ~neighbourhood_cleansed
)
fig
```
Visualisation univariée densité {data-navmenu="presentation sommaire"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### reviews_per_month versus room_type
```{r,echo=FALSE}
# Density of reviews_per_month drawn as one line per room type.
room_density <- stat_density(
  mapping = aes(group = room_type, color = room_type),
  geom = "line",
  position = "identity"
)
p <- ggplot(data = listing2, mapping = aes(x = reviews_per_month)) +
  room_density

# Interactive version with a range slider and x-aligned hover.
fig <- layout(
  rangeslider(ggplotly(p, dynamicTicks = TRUE)),
  hovermode = "x"
)
fig
```
Correlation des données {data-navmenu="Analyse des données"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### Cas des variables numériques
```{r}
# Banner value box introducing the correlation page.
interval2 <- "Correlation des variables numeriques"
valueBox(
  interval2,
  caption = "Boite à moustache",
  icon = "fa-user-times",
  color = "blue"
)
# Unused page header kept for reference:
# Resume visualisation {data-icon="fa-globe"} {data-navmenu="Menu B"}
```
Row {data-width=150}
-----------------------------------------------------------------------
### Matrice de correlation
```{r,echo=FALSE}
# Spearman correlation matrix of the numeric columns of listing2, rendered as
# a table. Rows with a missing value in any numeric column are dropped first.
dd <- dplyr::select(listing2, where(is.numeric))
d1 <- as.matrix(na.omit(dd))
dv <- na.omit(listing2)
# Pairwise correlations between each pair of variables.
mydata.cor <- cor(d1, method = "spearman")
knitr::kable(mydata.cor)
# Disabled graphical alternatives, kept for reference:
#   corrplot(mydata.cor)
#   col = colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#   corrplot(cor(as.matrix(d1)), method = "color", col = col(200),
#            type = "upper", order = "hclust",
#            addCoef.col = "black",
#            tl.col = "black", tl.srt = 45,
#            diag = FALSE)
```
### Visualisation de la matrice de corrélation
```{r,echo=FALSE}
# Correlogram of the Spearman correlations between the numeric columns of
# listing2. Rows with a missing value in any numeric column are dropped first.
dd <- dplyr::select(listing2, where(is.numeric))
d1 <- as.matrix(na.omit(dd))
#dv=na.omit(listing2)
# Pairwise correlations between each pair of variables.
mydata.cor <- cor(d1, method = "spearman")
corrplot(mydata.cor)
# Disabled coloured variant, kept for reference:
#   col = colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#   corrplot(cor(as.matrix(d1)), method = "color", col = col(200),
#            type = "upper", order = "hclust",
#            addCoef.col = "black",
#            tl.col = "black", tl.srt = 45,
#            diag = FALSE)
```
Mesure de tendance et de dispersion {data-navmenu="presentation sommaire"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### statistique descriptives du price,availability_30,availability_60,availability_365,number_of_reviews
```{r}
# Descriptive statistics of price, availability and review counts, computed
# per room type on the complete rows of listing2 and printed with pander.
ds <- na.omit(listing2)
stat_cols <- c(
  "price", "availability_30", "availability_60", "availability_365",
  "number_of_reviews"
)
stats_by_room <- describeBy(ds[, stat_cols], ds$room_type)
pander::pander(stats_by_room)
# Disabled reference code for the column subset:
#   listing1=subset(listing,select=c(neighbourhood_cleansed,property_type,
#   room_type,accommodates,bedrooms,beds,price,availability_30,
#   availability_60,availability_365,number_of_reviews,review_scores_rating,
#   reviews_per_month))
```
resumé sommaire des données {data-navmenu="Analyse exploration des données"}
=============================
Row {data-width=150}
-----------------------------------------------------------------------
### distribution par temps de reponse de l'administrateur
```{r,echo=FALSE}
# Frequency table of host_response_time, shown as an interactive DT table.
# Disabled earlier variant, kept for reference:
#   y <- listing$host_response_time
#   cbind(freq=table(y), percentage=round(prop.table(table(y))*100,1))
response_times <- pull(listing, host_response_time)
response_freq <- response_times %>%
  table() %>%
  as.data.frame()
datatable(
  response_freq,
  rownames = FALSE,
  colnames = c("temps de reponse", "Frequence")
)
```